<!DOCTYPE HTML>
<html lang="zh-Hans">
    <!-- Start book Python爬虫课程讲义 -->
    <head>
        <!-- head:start -->
        <meta charset="UTF-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge" />
        <title>处理一些格式规范的文字 | Python爬虫课程讲义</title>
        <meta content="text/html; charset=utf-8" http-equiv="Content-Type">
        <meta name="description" content="">
        <meta name="generator" content="GitBook 2.6.7">
        <meta name="author" content="BigCat">
        
        <meta name="HandheldFriendly" content="true"/>
        <meta name="viewport" content="width=device-width, initial-scale=1">
        <meta name="apple-mobile-web-app-capable" content="yes">
        <meta name="apple-mobile-web-app-status-bar-style" content="black">
        <link rel="apple-touch-icon-precomposed" sizes="152x152" href="../../gitbook/images/apple-touch-icon-precomposed-152.png">
        <link rel="shortcut icon" href="../../gitbook/images/favicon.ico" type="image/x-icon">
        
    <link rel="stylesheet" href="../../gitbook/style.css">
    
        
        <link rel="stylesheet" href="../../gitbook/plugins/gitbook-plugin-tbfed-pagefooter/footer.css">
        
    
        
        <link rel="stylesheet" href="../../gitbook/plugins/gitbook-plugin-splitter/splitter.css">
        
    
        
        <link rel="stylesheet" href="../../gitbook/plugins/gitbook-plugin-toggle-chapters/toggle.css">
        
    
        
        <link rel="stylesheet" href="../../gitbook/plugins/gitbook-plugin-highlight/website.css">
        
    
        
        <link rel="stylesheet" href="../../gitbook/plugins/gitbook-plugin-fontsettings/website.css">
        
    
    

        
    
    
    <link rel="next" href="../../file/part03/3.8.html" />
    
    
    <link rel="prev" href="../../file/part03/3.6.html" />
    

        <!-- head:end -->
    </head>
    <body>
        <!-- body:start -->
        
    <div class="book"
        data-level="3.7"
        data-chapter-title="处理一些格式规范的文字"
        data-filepath="file/part03/3.7.md"
        data-basepath="../.."
        data-revision="Thu Feb 09 2017 09:48:59 GMT+0800 (CST)"
        data-innerlanguage="">
    

<div class="book-summary">
    <nav role="navigation">
        <ul class="summary">
            
            
            
            

            

            
    
        <li class="chapter " data-level="0" data-path="index.html">
            
                
                    <a href="../../index.html">
                
                        <i class="fa fa-check"></i>
                        
                        传智播客Python学院爬虫课程
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="1" data-path="file/part01/1.html">
            
                
                    <a href="../../file/part01/1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.</b>
                        
                        爬虫原理与数据抓取
                    </a>
            
            
            <ul class="articles">
                
    
        <li class="chapter " data-level="1.1" data-path="file/part01/1.1.html">
            
                
                    <a href="../../file/part01/1.1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.1.</b>
                        
                        (了解)通用爬虫和聚焦爬虫
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="1.2" data-path="file/part01/1.2.html">
            
                
                    <a href="../../file/part01/1.2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.2.</b>
                        
                        (复习)HTTP/HTTPS的请求与响应
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="1.3" data-path="file/part01/1.3.html">
            
                
                    <a href="../../file/part01/1.3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.3.</b>
                        
                        HTTP/HTTPS抓包工具-Fiddler
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="1.4" data-path="file/part01/1.4.html">
            
                
                    <a href="../../file/part01/1.4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.4.</b>
                        
                        urllib2模块的基本使用
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="1.5" data-path="file/part01/1.5.html">
            
                
                    <a href="../../file/part01/1.5.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.5.</b>
                        
                        urllib2：GET请求和POST请求
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="1.6" data-path="file/part01/1.6.html">
            
                
                    <a href="../../file/part01/1.6.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.6.</b>
                        
                        urllib2：Handler处理器和自定义Opener
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="1.7" data-path="file/part01/1.7.html">
            
                
                    <a href="../../file/part01/1.7.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.7.</b>
                        
                        urllib2：URLError与HTTPError
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="1.8" data-path="file/part01/1.8.html">
            
                
                    <a href="../../file/part01/1.8.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.8.</b>
                        
                        Requests模块
                    </a>
            
            
        </li>
    

            </ul>
            
        </li>
    
        <li class="chapter " data-level="2" data-path="file/part02/2.html">
            
                
                    <a href="../../file/part02/2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.</b>
                        
                        非结构化数据与结构化数据提取
                    </a>
            
            
            <ul class="articles">
                
    
        <li class="chapter " data-level="2.1" data-path="file/part02/2.1.html">
            
                
                    <a href="../../file/part02/2.1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.1.</b>
                        
                        正则表达式re模块
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="2.2" data-path="file/part02/2.2.html">
            
                
                    <a href="../../file/part02/2.2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.2.</b>
                        
                        案例：使用正则表达式的爬虫
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="2.3" data-path="file/part02/2.3.html">
            
                
                    <a href="../../file/part02/2.3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.3.</b>
                        
                        XPath与lxml类库
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="2.4" data-path="file/part02/2.4.html">
            
                
                    <a href="../../file/part02/2.4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.4.</b>
                        
                        案例：使用XPath的爬虫
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="2.5" data-path="file/part02/2.5.html">
            
                
                    <a href="../../file/part02/2.5.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.5.</b>
                        
                        BeautifulSoup4 解析器
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="2.6" data-path="file/part02/2.6.html">
            
                
                    <a href="../../file/part02/2.6.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.6.</b>
                        
                        案例：使用bs4的爬虫
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="2.7" data-path="file/part02/2.7.html">
            
                
                    <a href="../../file/part02/2.7.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.7.</b>
                        
                        JSON模块与JsonPath
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="2.8" data-path="file/part02/2.8.html">
            
                
                    <a href="../../file/part02/2.8.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.8.</b>
                        
                        糗事百科案例
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="2.9" data-path="file/part02/2.9.html">
            
                
                    <a href="../../file/part02/2.9.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.9.</b>
                        
                        多线程爬虫案例
                    </a>
            
            
        </li>
    

            </ul>
            
        </li>
    
        <li class="chapter " data-level="3" data-path="file/part03/3.html">
            
                
                    <a href="../../file/part03/3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.</b>
                        
                        动态HTML处理和机器图像识别
                    </a>
            
            
            <ul class="articles">
                
    
        <li class="chapter " data-level="3.1" data-path="file/part03/3.1.html">
            
                
                    <a href="../../file/part03/3.1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.1.</b>
                        
                        动态HTML介绍
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.2" data-path="file/part03/3.2.html">
            
                
                    <a href="../../file/part03/3.2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.2.</b>
                        
                        Selenium与PhantomJS
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.3" data-path="file/part03/3.3.html">
            
                
                    <a href="../../file/part03/3.3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.3.</b>
                        
                        案例一：网站模拟登录
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.4" data-path="file/part03/3.4.html">
            
                
                    <a href="../../file/part03/3.4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.4.</b>
                        
                        案例二：动态页面模拟点击
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.5" data-path="file/part03/3.5.html">
            
                
                    <a href="../../file/part03/3.5.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.5.</b>
                        
                        案例三：执行JavaScript语句
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.6" data-path="file/part03/3.6.html">
            
                
                    <a href="../../file/part03/3.6.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.6.</b>
                        
                        机器视觉与Tesseract介绍
                    </a>
            
            
        </li>
    
        <li class="chapter active" data-level="3.7" data-path="file/part03/3.7.html">
            
                
                    <a href="../../file/part03/3.7.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.7.</b>
                        
                        处理一些格式规范的文字
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.8" data-path="file/part03/3.8.html">
            
                
                    <a href="../../file/part03/3.8.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.8.</b>
                        
                        案例：尝试对验证码进行机器识别处理
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.9" data-path="file/part03/3.9.html">
            
                
                    <a href="../../file/part03/3.9.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.9.</b>
                        
                        机器学习：训练Tesseract
                    </a>
            
            
        </li>
    

            </ul>
            
        </li>
    
        <li class="chapter " data-level="4" data-path="file/part04/4.html">
            
                
                    <a href="../../file/part04/4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.</b>
                        
                        Scrapy框架
                    </a>
            
            
            <ul class="articles">
                
    
        <li class="chapter " data-level="4.1" data-path="file/part04/4.1.html">
            
                
                    <a href="../../file/part04/4.1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.1.</b>
                        
                        配置安装
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="4.2" data-path="file/part04/4.2.html">
            
                
                    <a href="../../file/part04/4.2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.2.</b>
                        
                        入门案例
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="4.3" data-path="file/part04/4.3.html">
            
                
                    <a href="../../file/part04/4.3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.3.</b>
                        
                        Scrapy Shell
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="4.4" data-path="file/part04/4.4.html">
            
                
                    <a href="../../file/part04/4.4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.4.</b>
                        
                        Item Pipeline
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="4.5" data-path="file/part04/4.5.html">
            
                
                    <a href="../../file/part04/4.5.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.5.</b>
                        
                        Spiders
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="4.6" data-path="file/part04/4.6.html">
            
                
                    <a href="../../file/part04/4.6.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.6.</b>
                        
                        CrawlSpiders
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="4.7" data-path="file/part04/4.7.html">
            
                
                    <a href="../../file/part04/4.7.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.7.</b>
                        
                        Request/Response
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="4.8" data-path="file/part04/4.8.html">
            
                
                    <a href="../../file/part04/4.8.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.8.</b>
                        
                        Downloader Middlewares
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="4.9" data-path="file/part04/4.9.html">
            
                
                    <a href="../../file/part04/4.9.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.9.</b>
                        
                        Settings
                    </a>
            
            
        </li>
    

            </ul>
            
        </li>
    
        <li class="chapter " data-level="5" data-path="file/part05/5.html">
            
                
                    <a href="../../file/part05/5.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.</b>
                        
                        Scrapy实战项目
                    </a>
            
            
            <ul class="articles">
                
    
        <li class="chapter " data-level="5.1" data-path="file/part05/5.1.html">
            
                
                    <a href="../../file/part05/5.1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.1.</b>
                        
                        (案例一)手机App抓包爬虫
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="5.2" data-path="file/part05/5.2.html">
            
                
                    <a href="../../file/part05/5.2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.2.</b>
                        
                        (案例二)阳光热线问政平台爬虫
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="5.3" data-path="file/part05/5.3.html">
            
                
                    <a href="../../file/part05/5.3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.3.</b>
                        
                        (案例三)新浪网分类资讯爬虫
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="5.4" data-path="file/part05/5.4.html">
            
                
                    <a href="../../file/part05/5.4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.4.</b>
                        
                        (案例四)图片下载器爬虫
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="5.5" data-path="file/part05/5.5.html">
            
                
                    <a href="../../file/part05/5.5.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.5.</b>
                        
                        (案例五)将数据保存在MongoDB中
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="5.6" data-path="file/part05/5.6.html">
            
                
                    <a href="../../file/part05/5.6.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.6.</b>
                        
                        (案例六)三种scrapy模拟登陆策略
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="5.7" data-path="file/part05/5.7.html">
            
                
                    <a href="../../file/part05/5.7.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.7.</b>
                        
                        附：通过Fiddler进行手机抓包方法
                    </a>
            
            
        </li>
    

            </ul>
            
        </li>
    
        <li class="chapter " data-level="6" data-path="file/part06/6.html">
            
                
                    <a href="../../file/part06/6.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>6.</b>
                        
                        scrapy-redis分布式组件
                    </a>
            
            
            <ul class="articles">
                
    
        <li class="chapter " data-level="6.1" data-path="file/part06/6.1.html">
            
                
                    <a href="../../file/part06/6.1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>6.1.</b>
                        
                        源码分析参考：Connection
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="6.2" data-path="file/part06/6.2.html">
            
                
                    <a href="../../file/part06/6.2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>6.2.</b>
                        
                        源码分析参考：Dupefilter
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="6.3" data-path="file/part06/6.3.html">
            
                
                    <a href="../../file/part06/6.3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>6.3.</b>
                        
                        源码分析参考：Picklecompat
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="6.4" data-path="file/part06/6.4.html">
            
                
                    <a href="../../file/part06/6.4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>6.4.</b>
                        
                        源码分析参考：Pipelines
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="6.5" data-path="file/part06/6.5.html">
            
                
                    <a href="../../file/part06/6.5.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>6.5.</b>
                        
                        源码分析参考：Queue
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="6.6" data-path="file/part06/6.6.html">
            
                
                    <a href="../../file/part06/6.6.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>6.6.</b>
                        
                        源码分析参考：Scheduler
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="6.7" data-path="file/part06/6.7.html">
            
                
                    <a href="../../file/part06/6.7.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>6.7.</b>
                        
                        源码分析参考：Spider
                    </a>
            
            
        </li>
    

            </ul>
            
        </li>
    
        <li class="chapter " data-level="7" data-path="file/part07/7.html">
            
                
                    <a href="../../file/part07/7.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.</b>
                        
                        scrapy-redis实战
                    </a>
            
            
            <ul class="articles">
                
    
        <li class="chapter " data-level="7.1" data-path="file/part07/7.1.html">
            
                
                    <a href="../../file/part07/7.1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.1.</b>
                        
                        源码自带项目说明
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="7.2" data-path="file/part07/7.2.html">
            
                
                    <a href="../../file/part07/7.2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.2.</b>
                        
                        有缘网分布式爬虫项目1
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="7.3" data-path="file/part07/7.3.html">
            
                
                    <a href="../../file/part07/7.3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.3.</b>
                        
                        有缘网分布式爬虫项目2
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="7.4" data-path="file/part07/7.4.html">
            
                
                    <a href="../../file/part07/7.4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.4.</b>
                        
                        处理Redis里的数据
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="7.5" data-path="file/part07/7.5.html">
            
                
                    <a href="../../file/part07/7.5.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.5.</b>
                        
                        尝试改写新浪网分类资讯爬虫1
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="7.6" data-path="file/part07/7.6.html">
            
                
                    <a href="../../file/part07/7.6.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.6.</b>
                        
                        尝试改写新浪网分类资讯爬虫2
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="7.7" data-path="file/part07/7.7.html">
            
                
                    <a href="../../file/part07/7.7.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.7.</b>
                        
                        IT桔子分布式项目1
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="7.8" data-path="file/part07/7.8.html">
            
                
                    <a href="../../file/part07/7.8.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.8.</b>
                        
                        IT桔子分布式项目2
                    </a>
            
            
        </li>
    

            </ul>
            
        </li>
    
        <li class="chapter " data-level="8" data-path="file/duanzi/duanzi.html">
            
                
                    <a href="../../file/duanzi/duanzi.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>8.</b>
                        
                        课余段子
                    </a>
            
            
        </li>
    


            
            <li class="divider"></li>
            <li>
                <a href="https://www.gitbook.com" target="_blank" rel="noopener noreferrer" class="gitbook-link">
                    Published with GitBook
                </a>
            </li>
            
        </ul>
    </nav>
</div>

    <div class="book-body">
        <div class="body-inner">
            <div class="book-header" role="navigation">
    <!-- Actions Left -->
    

    <!-- Title -->
    <h1>
        <i class="fa fa-circle-o-notch fa-spin"></i>
        <a href="../../" >Python爬虫课程讲义</a>
    </h1>
</div>

            <div class="page-wrapper" tabindex="-1" role="main">
                <div class="page-inner">
                
                
                    <section class="normal" id="section-">
                    
                        <h2 id="&#x5904;&#x7406;&#x7ED9;&#x89C4;&#x8303;&#x7684;&#x6587;&#x5B57;">&#x5904;&#x7406;&#x683C;&#x5F0F;&#x89C4;&#x8303;&#x7684;&#x6587;&#x5B57;</h2>
<p>&#x4F60;&#x8981;&#x5904;&#x7406;&#x7684;&#x5927;&#x591A;&#x6570;&#x6587;&#x5B57;&#x90FD;&#x662F;&#x6BD4;&#x8F83;&#x5E72;&#x51C0;&#x3001;&#x683C;&#x5F0F;&#x89C4;&#x8303;&#x7684;&#x3002;&#x683C;&#x5F0F;&#x89C4;&#x8303;&#x7684;&#x6587;&#x5B57;&#x901A;&#x5E38;&#x53EF;&#x4EE5;&#x6EE1;&#x8DB3;&#x4E00;&#x4E9B;&#x9700;&#x6C42;,&#x4E0D;&#x8FC7;&#x7A76;&#x7ADF;&#x4EC0;&#x4E48;&#x662F;&#x201C;&#x683C;&#x5F0F;&#x6DF7;&#x4E71;&#x201D;,&#x4EC0;&#x4E48;&#x7B97;&#x201C;&#x683C;&#x5F0F;&#x89C4;&#x8303;&#x201D;,&#x786E;&#x5B9E;&#x56E0;&#x4EBA;&#x800C;&#x5F02;&#x3002; &#x901A;&#x5E38;,&#x683C;&#x5F0F;&#x89C4;&#x8303;&#x7684;&#x6587;&#x5B57;&#x5177;&#x6709;&#x4EE5;&#x4E0B;&#x7279;&#x70B9;:</p>
<ul>
<li>&#x4F7F;&#x7528;&#x4E00;&#x4E2A;&#x6807;&#x51C6;&#x5B57;&#x4F53;(&#x4E0D;&#x5305;&#x542B;&#x624B;&#x5199;&#x4F53;&#x3001;&#x8349;&#x4E66;,&#x6216;&#x8005;&#x5341;&#x5206;&#x201C;&#x82B1;&#x54E8;&#x7684;&#x201D;&#x5B57;&#x4F53;)</li>
<li>&#x867D;&#x7136;&#x88AB;&#x590D;&#x5370;&#x6216;&#x62CD;&#x7167;,&#x5B57;&#x4F53;&#x8FD8;&#x662F;&#x5F88;&#x6E05;&#x6670;,&#x6CA1;&#x6709;&#x591A;&#x4F59;&#x7684;&#x75D5;&#x8FF9;&#x6216;&#x6C61;&#x70B9;</li>
<li>&#x6392;&#x5217;&#x6574;&#x9F50;,&#x6CA1;&#x6709;&#x6B6A;&#x6B6A;&#x659C;&#x659C;&#x7684;&#x5B57;</li>
<li>&#x6CA1;&#x6709;&#x8D85;&#x51FA;&#x56FE;&#x7247;&#x8303;&#x56F4;,&#x4E5F;&#x6CA1;&#x6709;&#x6B8B;&#x7F3A;&#x4E0D;&#x5168;,&#x6216;&#x7D27;&#x7D27;&#x8D34;&#x5728;&#x56FE;&#x7247;&#x7684;&#x8FB9;&#x7F18;</li>
</ul>
<p>&#x6587;&#x5B57;&#x7684;&#x4E00;&#x4E9B;&#x683C;&#x5F0F;&#x95EE;&#x9898;&#x5728;&#x56FE;&#x7247;&#x9884;&#x5904;&#x7406;&#x65F6;&#x53EF;&#x4EE5;&#x8FDB;&#x884C;&#x89E3;&#x51B3;&#x3002;&#x4F8B;&#x5982;,&#x53EF;&#x4EE5;&#x628A;&#x56FE;&#x7247;&#x8F6C;&#x6362;&#x6210;&#x7070;&#x5EA6;&#x56FE;,&#x8C03;&#x6574;&#x4EAE;&#x5EA6;&#x548C;&#x5BF9;&#x6BD4;&#x5EA6;,&#x8FD8;&#x53EF;&#x4EE5;&#x6839;&#x636E;&#x9700;&#x8981;&#x8FDB;&#x884C;&#x88C1;&#x526A;&#x548C;&#x65CB;&#x8F6C;&#xFF08;&#x8BE6;&#x60C5;&#x8BF7;&#x5173;&#x6CE8;&#x56FE;&#x50CF;&#x4E0E;&#x4FE1;&#x53F7;&#x5904;&#x7406;&#xFF09;&#xFF0C;&#x4F46;&#x662F;,&#x8FD9;&#x4E9B;&#x505A;&#x6CD5;&#x5728;&#x8FDB;&#x884C;&#x66F4;&#x5177;&#x6269;&#x5C55;&#x6027;&#x7684;&#x8BAD;&#x7EC3;&#x65F6;&#x4F1A;&#x9047;&#x5230;&#x4E00;&#x4E9B;&#x9650;&#x5236;&#x3002;</p>
<h3 id="&#x683C;&#x5F0F;&#x89C4;&#x8303;&#x6587;&#x5B57;&#x7684;&#x7406;&#x60F3;&#x793A;&#x4F8B;">&#x683C;&#x5F0F;&#x89C4;&#x8303;&#x6587;&#x5B57;&#x7684;&#x7406;&#x60F3;&#x793A;&#x4F8B;</h3>
<p><img src="../images/tesseracttest.jpg" alt=""></p>
<p>&#x901A;&#x8FC7;&#x4E0B;&#x9762;&#x7684;&#x547D;&#x4EE4;&#x8FD0;&#x884C; Tesseract&#xFF0C;&#x8BFB;&#x53D6;&#x6587;&#x4EF6;&#x5E76;&#x628A;&#x7ED3;&#x679C;&#x5199;&#x5230;&#x4E00;&#x4E2A;&#x6587;&#x672C;&#x6587;&#x4EF6;&#x4E2D;:
<code>tesseract test.jpg text</code></p>
<p><img src="../images/tesseract_test.png" alt=""></p>
<p><code>cat text.txt</code> &#x5373;&#x53EF;&#x663E;&#x793A;&#x7ED3;&#x679C;&#x3002;</p>
<p>&#x8BC6;&#x522B;&#x7ED3;&#x679C;&#x5F88;&#x51C6;&#x786E;,&#x4E0D;&#x8FC7;&#x7B26;&#x53F7;<code>^</code>&#x548C;<code>*</code>&#x5206;&#x522B;&#x88AB;&#x8868;&#x793A;&#x6210;&#x4E86;&#x53CC;&#x5F15;&#x53F7;&#x548C;&#x5355;&#x5F15;&#x53F7;&#x3002;&#x5927;&#x4F53;&#x4E0A;&#x53EF;&#x4EE5;&#x8BA9;&#x4F60;&#x5F88;&#x8212;&#x670D;&#x5730;&#x9605;&#x8BFB;&#x3002;</p>
<h3 id="&#x901A;&#x8FC7;python&#x4EE3;&#x7801;&#x5B9E;&#x73B0;">&#x901A;&#x8FC7;Python&#x4EE3;&#x7801;&#x5B9E;&#x73B0;</h3>
<pre><code class="lang-python"><span class="hljs-keyword">import</span> pytesseract
<span class="hljs-keyword">from</span> PIL <span class="hljs-keyword">import</span> Image

image = Image.open(<span class="hljs-string">&apos;test.jpg&apos;</span>)
text = pytesseract.image_to_string(image)
<span class="hljs-keyword">print</span> text
</code></pre>
<p>&#x8FD0;&#x884C;&#x7ED3;&#x679C;&#xFF1A;</p>
<pre><code>This is some text, written in Arial, that will be read by
Tesseract. Here are some symbols: !@#$%&quot;&amp;*()
</code></pre><h3 id="&#x5BF9;&#x56FE;&#x7247;&#x8FDB;&#x884C;&#x9608;&#x503C;&#x8FC7;&#x6EE4;&#x548C;&#x964D;&#x566A;&#x5904;&#x7406;&#xFF08;&#x4E86;&#x89E3;&#x5373;&#x53EF;&#xFF09;">&#x5BF9;&#x56FE;&#x7247;&#x8FDB;&#x884C;&#x9608;&#x503C;&#x8FC7;&#x6EE4;&#x548C;&#x964D;&#x566A;&#x5904;&#x7406;&#xFF08;&#x4E86;&#x89E3;&#x5373;&#x53EF;&#xFF09;</h3>
<p>&#x5F88;&#x591A;&#x65F6;&#x5019;&#x6211;&#x4EEC;&#x5728;&#x7F51;&#x4E0A;&#x4F1A;&#x770B;&#x5230;&#x8FD9;&#x6837;&#x7684;&#x56FE;&#x7247;&#xFF1A;</p>
<p><img src="../images/tess2.jpg" alt=""></p>
<p>Tesseract &#x4E0D;&#x80FD;&#x5B8C;&#x6574;&#x5904;&#x7406;&#x8FD9;&#x4E2A;&#x56FE;&#x7247;,&#x4E3B;&#x8981;&#x662F;&#x56E0;&#x4E3A;&#x56FE;&#x7247;&#x80CC;&#x666F;&#x8272;&#x662F;&#x6E10;&#x53D8;&#x7684;,&#x6700;&#x7EC8;&#x7ED3;&#x679C;&#x662F;&#x8FD9;&#x6837;:</p>
<p><img src="../images/tesseracttesst.png" alt=""></p>
<p>&#x968F;&#x7740;&#x80CC;&#x666F;&#x8272;&#x4ECE;&#x5DE6;&#x5230;&#x53F3;&#x4E0D;&#x65AD;&#x52A0;&#x6DF1;,&#x6587;&#x5B57;&#x53D8;&#x5F97;&#x8D8A;&#x6765;&#x8D8A;&#x96BE;&#x4EE5;&#x8BC6;&#x522B;,Tesseract &#x8BC6;&#x522B;&#x51FA;&#x7684;&#x6BCF;&#x4E00;&#x884C;&#x7684;&#x6700;&#x540E;&#x51E0;&#x4E2A;&#x5B57;&#x7B26;&#x90FD;&#x662F;&#x9519;&#x7684;&#x3002;</p>
<p>&#x9047;&#x5230;&#x8FD9;&#x7C7B;&#x95EE;&#x9898;,&#x53EF;&#x4EE5;&#x5148;&#x7528; Python &#x811A;&#x672C;&#x5BF9;&#x56FE;&#x7247;&#x8FDB;&#x884C;&#x6E05;&#x7406;&#x3002;&#x5229;&#x7528; Pillow &#x5E93;,&#x6211;&#x4EEC;&#x53EF;&#x4EE5;&#x521B;&#x5EFA;&#x4E00;&#x4E2A;&#x9608;&#x503C;&#x8FC7;&#x6EE4;&#x5668;&#x6765;&#x53BB;&#x6389;&#x6E10;&#x53D8;&#x7684;&#x80CC;&#x666F;&#x8272;,&#x53EA;&#x628A;&#x6587;&#x5B57;&#x7559;&#x4E0B;&#x6765;,&#x4ECE;&#x800C;&#x8BA9;&#x56FE;&#x7247;&#x66F4;&#x52A0;&#x6E05;&#x6670;,&#x4FBF;&#x4E8E; Tesseract &#x8BFB;&#x53D6;:</p>
<pre><code class="lang-python"><span class="hljs-keyword">from</span> PIL <span class="hljs-keyword">import</span> Image 
<span class="hljs-keyword">import</span> subprocess

<span class="hljs-function"><span class="hljs-keyword">def</span> <span class="hljs-title">cleanFile</span><span class="hljs-params">(filePath, newFilePath)</span>:</span> 
    image = Image.open(filePath)

    <span class="hljs-comment"># &#x5BF9;&#x56FE;&#x7247;&#x8FDB;&#x884C;&#x9608;&#x503C;&#x8FC7;&#x6EE4;,&#x7136;&#x540E;&#x4FDD;&#x5B58;</span>
    image = image.point(<span class="hljs-keyword">lambda</span> x: <span class="hljs-number">0</span> <span class="hljs-keyword">if</span> x&lt;<span class="hljs-number">143</span> <span class="hljs-keyword">else</span> <span class="hljs-number">255</span>)     
    image.save(newFilePath)

    <span class="hljs-comment"># &#x8C03;&#x7528;&#x7CFB;&#x7EDF;&#x7684;tesseract&#x547D;&#x4EE4;&#x5BF9;&#x56FE;&#x7247;&#x8FDB;&#x884C;OCR&#x8BC6;&#x522B;     </span>
    subprocess.call([<span class="hljs-string">&quot;tesseract&quot;</span>, newFilePath, <span class="hljs-string">&quot;output&quot;</span>])

    <span class="hljs-comment"># &#x6253;&#x5F00;&#x6587;&#x4EF6;&#x8BFB;&#x53D6;&#x7ED3;&#x679C;</span>
    file = open(<span class="hljs-string">&quot;output.txt&quot;</span>, <span class="hljs-string">&apos;r&apos;</span>)     
    print(file.read()) 
    file.close()

cleanFile(<span class="hljs-string">&quot;text2.jpg&quot;</span>, <span class="hljs-string">&quot;text2clean.png&quot;</span>)
</code></pre>
<p>&#x901A;&#x8FC7;&#x4E00;&#x4E2A;&#x9608;&#x503C;&#x5BF9;&#x524D;&#x9762;&#x7684;&#x201C;&#x6A21;&#x7CCA;&#x201D;&#x56FE;&#x7247;&#x8FDB;&#x884C;&#x8FC7;&#x6EE4;&#x7684;&#x7ED3;&#x679C;</p>
<p><img src="../images/tess2clean.png" alt=""></p>
<p>&#x9664;&#x4E86;&#x4E00;&#x4E9B;&#x6807;&#x70B9;&#x7B26;&#x53F7;&#x4E0D;&#x592A;&#x6E05;&#x6670;&#x6216;&#x4E22;&#x5931;&#x4E86;,&#x5927;&#x90E8;&#x5206;&#x6587;&#x5B57;&#x90FD;&#x88AB;&#x8BFB;&#x51FA;&#x6765;&#x4E86;&#x3002;Tesseract &#x7ED9;&#x51FA;&#x4E86;&#x6700;&#x597D;&#x7684;&#x7ED3;&#x679C;:</p>
<p><img src="../images/tesseracttest3.png" alt=""></p>
<h2 id="&#x4ECE;&#x7F51;&#x7AD9;&#x56FE;&#x7247;&#x4E2D;&#x6293;&#x53D6;&#x6587;&#x5B57;">&#x4ECE;&#x7F51;&#x7AD9;&#x56FE;&#x7247;&#x4E2D;&#x6293;&#x53D6;&#x6587;&#x5B57;</h2>
<p>&#x7528; Tesseract &#x8BFB;&#x53D6;&#x786C;&#x76D8;&#x91CC;&#x56FE;&#x7247;&#x4E0A;&#x7684;&#x6587;&#x5B57;,&#x53EF;&#x80FD;&#x4E0D;&#x600E;&#x4E48;&#x4EE4;&#x4EBA;&#x5174;&#x594B;,&#x4F46;&#x5F53;&#x6211;&#x4EEC;&#x628A;&#x5B83;&#x548C;&#x7F51;&#x7EDC;&#x722C;&#x866B;&#x7EC4;&#x5408;&#x4F7F;&#x7528;&#x65F6;,&#x5C31;&#x80FD;&#x6210;&#x4E3A;&#x4E00;&#x4E2A;&#x5F3A;&#x5927;&#x7684;&#x5DE5;&#x5177;&#x3002;</p>
<p>&#x7F51;&#x7AD9;&#x4E0A;&#x7684;&#x56FE;&#x7247;&#x53EF;&#x80FD;&#x5E76;&#x4E0D;&#x662F;&#x6545;&#x610F;&#x628A;&#x6587;&#x5B57;&#x505A;&#x5F97;&#x5F88;&#x82B1;&#x54E8; (&#x5C31;&#x50CF;&#x9910;&#x9986;&#x83DC;&#x5355;&#x7684; JPG &#x56FE;&#x7247;&#x4E0A;&#x7684;&#x827A;&#x672F;&#x5B57;),&#x4F46;&#x5B83;&#x4EEC;&#x4E0A;&#x9762;&#x7684;&#x6587;&#x5B57;&#x5BF9;&#x7F51;&#x7EDC;&#x722C;&#x866B;&#x6765;&#x8BF4;&#x5C31;&#x662F;&#x9690;&#x85CF;&#x8D77;&#x6765;
&#x4E86;&#xFF0C;&#x4E3E;&#x4E2A;&#x4F8B;&#x5B50;&#xFF1A;</p>
<ul>
<li><p>&#x867D;&#x7136;&#x4E9A;&#x9A6C;&#x900A;&#x7684; robots.txt &#x6587;&#x4EF6;&#x5141;&#x8BB8;&#x6293;&#x53D6;&#x7F51;&#x7AD9;&#x7684;&#x4EA7;&#x54C1;&#x9875;&#x9762;,&#x4F46;&#x662F;&#x56FE;&#x4E66;&#x7684;&#x9884;&#x89C8;&#x9875;&#x901A;&#x5E38;&#x4E0D;&#x8BA9;&#x7F51;&#x7EDC;&#x673A;&#x5668;&#x4EBA;&#x91C7;&#x96C6;&#x3002;</p>
</li>
<li><p>&#x56FE;&#x4E66;&#x7684;&#x9884;&#x89C8;&#x9875;&#x662F;&#x901A;&#x8FC7;&#x7528;&#x6237;&#x89E6;&#x53D1; Ajax &#x811A;&#x672C;&#x8FDB;&#x884C;&#x52A0;&#x8F7D;&#x7684;,&#x9884;&#x89C8;&#x56FE;&#x7247;&#x9690;&#x85CF;&#x5728; div &#x8282;&#x70B9;&#x4E0B;&#x9762;;&#x5176;&#x5B9E;,&#x666E;&#x901A;&#x7684;&#x8BBF;&#x95EE;&#x8005;&#x4F1A;&#x89C9;&#x5F97;&#x5B83;&#x4EEC;&#x770B;&#x8D77;&#x6765;&#x66F4;&#x50CF;&#x662F;&#x4E00;&#x4E2A; Flash &#x52A8;&#x753B;,&#x800C;&#x4E0D;&#x662F;&#x4E00;&#x4E2A;&#x56FE;&#x7247;&#x6587;&#x4EF6;&#x3002;&#x5F53;&#x7136;,&#x5373;&#x4F7F;&#x6211;&#x4EEC;&#x80FD;&#x83B7;&#x5F97;&#x56FE;&#x7247;,&#x8981;&#x628A;&#x5B83;&#x4EEC;&#x8BFB;&#x6210;&#x6587;&#x5B57;&#x4E5F;&#x6CA1;&#x90A3;&#x4E48;&#x7B80;&#x5355;&#x3002;</p>
</li>
<li><p>&#x4E0B;&#x9762;&#x7684;&#x7A0B;&#x5E8F;&#x5C31;&#x89E3;&#x51B3;&#x4E86;&#x8FD9;&#x4E2A;&#x95EE;&#x9898;:&#x9996;&#x5148;&#x5BFC;&#x822A;&#x5230;&#x6258;&#x5C14;&#x65AF;&#x6CF0;&#x7684;&#x300A;&#x6218;&#x4E89;&#x4E0E;&#x548C;&#x5E73;&#x300B;&#x7684;&#x5927;&#x5B57;&#x53F7;&#x5370;&#x5237;&#x7248;,&#x6253;&#x5F00;&#x9605;&#x8BFB;&#x5668;,&#x6536;&#x96C6;&#x56FE;&#x7247;&#x7684; URL &#x94FE;&#x63A5;,&#x7136;&#x540E;&#x4E0B;&#x8F7D;&#x56FE;&#x7247;,&#x8BC6;&#x522B;&#x56FE;&#x7247;,&#x6700;&#x540E;&#x6253;&#x5370;&#x6BCF;&#x4E2A;&#x56FE;&#x7247;&#x7684;&#x6587;&#x5B57;&#x3002;&#x56E0;&#x4E3A;&#x8FD9;&#x4E2A;&#x7A0B;&#x5E8F;&#x5F88;&#x590D;&#x6742;,&#x5229;&#x7528;&#x4E86;&#x524D;&#x9762;&#x51E0;&#x7AE0;&#x7684;&#x591A;&#x4E2A;&#x7A0B;&#x5E8F;&#x7247;&#x6BB5;,&#x6240;&#x4EE5;&#x6211;&#x589E;&#x52A0;&#x4E86;&#x4E00;&#x4E9B;&#x6CE8;&#x91CA;&#x4EE5;&#x8BA9;&#x6BCF;&#x6BB5;&#x4EE3;&#x7801;&#x7684;&#x76EE;&#x7684;&#x66F4;&#x52A0;&#x6E05;&#x6670;:</p>
</li>
</ul>
<pre><code class="lang-python">
<span class="hljs-keyword">import</span> time
<span class="hljs-keyword">from</span> urllib.request <span class="hljs-keyword">import</span> urlretrieve 
<span class="hljs-keyword">import</span> subprocess
<span class="hljs-keyword">from</span> selenium <span class="hljs-keyword">import</span> webdriver
<span class="hljs-comment">#&#x521B;&#x5EFA;&#x65B0;&#x7684;Selenium driver</span>
driver = webdriver.PhantomJS()

<span class="hljs-comment"># &#x7528;Selenium&#x8BD5;&#x8BD5;Firefox&#x6D4F;&#x89C8;&#x5668;:</span>
<span class="hljs-comment"># driver = webdriver.Firefox()</span>

driver.get(<span class="hljs-string">&quot;http://www.amazon.com/War-Peace-Leo-Nikolayevich-Tolstoy/dp/1427030200&quot;</span>)
<span class="hljs-comment"># &#x5355;&#x51FB;&#x56FE;&#x4E66;&#x9884;&#x89C8;&#x6309;&#x94AE;</span>
driver.find_element_by_id(<span class="hljs-string">&quot;sitbLogoImg&quot;</span>).click()
imageList = set()
<span class="hljs-comment"># &#x7B49;&#x5F85;&#x9875;&#x9762;&#x52A0;&#x8F7D;&#x5B8C;&#x6210;</span>
time.sleep(<span class="hljs-number">5</span>)
<span class="hljs-comment"># &#x5F53;&#x5411;&#x53F3;&#x7BAD;&#x5934;&#x53EF;&#x4EE5;&#x70B9;&#x51FB;&#x65F6;,&#x5F00;&#x59CB;&#x7FFB;&#x9875;</span>
<span class="hljs-keyword">while</span> <span class="hljs-string">&quot;pointer&quot;</span> <span class="hljs-keyword">in</span> driver.find_element_by_id(<span class="hljs-string">&quot;sitbReaderRightPageTurner&quot;</span>).get_attribute(<span class="hljs-string">&quot;style&quot;</span>):
    driver.find_element_by_id(<span class="hljs-string">&quot;sitbReaderRightPageTurner&quot;</span>).click()
    time.sleep(<span class="hljs-number">2</span>)
    <span class="hljs-comment"># &#x83B7;&#x53D6;&#x5DF2;&#x52A0;&#x8F7D;&#x7684;&#x65B0;&#x9875;&#x9762;(&#x4E00;&#x6B21;&#x53EF;&#x4EE5;&#x52A0;&#x8F7D;&#x591A;&#x4E2A;&#x9875;&#x9762;,&#x4F46;&#x662F;&#x91CD;&#x590D;&#x7684;&#x9875;&#x9762;&#x4E0D;&#x80FD;&#x52A0;&#x8F7D;&#x5230;&#x96C6;&#x5408;&#x4E2D;) </span>
    pages = driver.find_elements_by_xpath(<span class="hljs-string">&quot;//div[@class=&apos;pageImage&apos;]/div/img&quot;</span>) 
    <span class="hljs-keyword">for</span> page <span class="hljs-keyword">in</span> pages:
        image = page.get_attribute(<span class="hljs-string">&quot;src&quot;</span>)
        imageList.add(image)
driver.quit()

<span class="hljs-comment"># &#x7528;Tesseract&#x5904;&#x7406;&#x6211;&#x4EEC;&#x6536;&#x96C6;&#x7684;&#x56FE;&#x7247;URL&#x94FE;&#x63A5; </span>
<span class="hljs-keyword">for</span> image <span class="hljs-keyword">in</span> sorted(imageList):
    <span class="hljs-comment"># &#x4FDD;&#x5B58;&#x56FE;&#x7247;</span>
    urlretrieve(image, <span class="hljs-string">&quot;page.jpg&quot;</span>)
    p = subprocess.Popen([<span class="hljs-string">&quot;tesseract&quot;</span>, <span class="hljs-string">&quot;page.jpg&quot;</span>, <span class="hljs-string">&quot;page&quot;</span>], stdout=subprocess.PIPE,stderr=subprocess.PIPE)
    p.wait()
    f = open(<span class="hljs-string">&quot;page.txt&quot;</span>, <span class="hljs-string">&quot;r&quot;</span>)
    print(f.read())
</code></pre>
<p>&#x548C;&#x6211;&#x4EEC;&#x524D;&#x9762;&#x4F7F;&#x7528; Tesseract &#x8BFB;&#x53D6;&#x7684;&#x6548;&#x679C;&#x4E00;&#x6837;,&#x8FD9;&#x4E2A;&#x7A0B;&#x5E8F;&#x4E5F;&#x4F1A;&#x5B8C;&#x7F8E;&#x5730;&#x6253;&#x5370;&#x4E66;&#x4E2D;&#x5F88;&#x591A;&#x957F;&#x957F;&#x7684;&#x6BB5;&#x843D;,&#x7B2C;&#x516D;&#x9875;&#x7684;&#x9884;&#x89C8;&#x5982;&#x4E0B;&#x6240;&#x793A;:</p>
<pre><code>6
     &quot;A word of friendly advice, mon
     cher. Be off as soon as you can,
     that&apos;s all I have to tell you. Happy
     he who has ears to hear. Good-by,
     my dear fellow. Oh, by the by!&quot; he
     shouted through the doorway after
     Pierre, &quot;is it true that the countess
     has fallen into the clutches of the
     holy fathers of the Society of je-
     sus?&quot;

     Pierre did not answer and left Ros-
     topchin&apos;s room more sullen and an-
     gry than he had ever before shown
     himself.
</code></pre><p>&#x4F46;&#x662F;,&#x5F53;&#x6587;&#x5B57;&#x51FA;&#x73B0;&#x5728;&#x5F69;&#x8272;&#x5C01;&#x9762;&#x4E0A;&#x65F6;,&#x7ED3;&#x679C;&#x5C31;&#x4E0D;&#x90A3;&#x4E48;&#x5B8C;&#x7F8E;&#x4E86;:</p>
<pre><code>   WEI&apos; nrrd Peace
   Len Nlkelayevldu Iolfluy
   Readmg shmdd be ax
   wlnvame asnossxble Wenfler
   an mm m our cram: Llhvary
    - Leo Tmsloy was a Russian rwovelwst
    I and moval phflmopher med lur
    A ms Ideas 01 nonviolenx reswslance m 5 We range     0, &quot;and&quot;
</code></pre><p>&#x5982;&#x679C;&#x60F3;&#x628A;&#x6587;&#x5B57;&#x52A0;&#x5DE5;&#x6210;&#x666E;&#x901A;&#x4EBA;&#x53EF;&#x4EE5;&#x770B;&#x61C2;&#x7684;&#x6548;&#x679C;,&#x8FD8;&#x9700;&#x8981;&#x82B1;&#x5F88;&#x591A;&#x65F6;&#x95F4;&#x53BB;&#x5904;&#x7406;&#x3002;</p>
<p>&#x4E0B;&#x4E00;&#x8282;&#x5C06;&#x4ECB;&#x7ECD;&#x53E6;&#x4E00;&#x79CD;&#x65B9;&#x6CD5;&#x6765;&#x89E3;&#x51B3;&#x6587;&#x5B57;&#x6DF7;&#x4E71;&#x7684;&#x95EE;&#x9898;,&#x5C24;&#x5176;&#x662F;&#x5F53;&#x4F60;&#x613F;&#x610F;&#x82B1;&#x4E00;&#x70B9;&#x513F;&#x65F6;&#x95F4;&#x8BAD;&#x7EC3; Tesseract &#x7684;&#x65F6;&#x5019;&#x3002;</p>
<p>&#x901A;&#x8FC7;&#x7ED9; Tesseract &#x63D0;&#x4F9B;&#x5927;&#x91CF;&#x5DF2;&#x77E5;&#x7684;&#x6587;&#x5B57;&#x4E0E;&#x56FE;&#x7247;&#x6620;&#x5C04;&#x96C6;,&#x7ECF;&#x8FC7;&#x8BAD;&#x7EC3; Tesseract &#x5C31;&#x53EF;&#x4EE5;&#x201C;&#x5B66;&#x4F1A;&#x201D;&#x8BC6;&#x522B;&#x540C;&#x4E00;&#x79CD;&#x5B57;&#x4F53;,&#x800C;&#x4E14;&#x53EF;&#x4EE5;&#x8FBE;&#x5230;&#x6781;&#x9AD8;&#x7684;&#x7CBE;&#x786E;&#x7387;&#x548C;&#x51C6;&#x786E;&#x7387;,&#x751A;&#x81F3;&#x53EF;&#x4EE5;&#x5FFD;&#x7565;&#x56FE;&#x7247;&#x4E2D;&#x6587;&#x5B57;&#x7684;&#x80CC;&#x666F;&#x8272;&#x548C;&#x76F8;&#x5BF9;&#x4F4D;&#x7F6E;&#x7B49;&#x95EE;&#x9898;&#x3002;</p>
<footer class="page-footer"><span class="copyright">Copyright &#xA9; BigCat all rights reserved&#xFF0C;powered by Gitbook</span><span class="footer-modification">&#x300C;Revision Time:
2017-01-18 03:51:33&#x300D;
</span></footer>
                    
                    </section>
                
                
                </div>
            </div>
        </div>

        
        <a href="../../file/part03/3.6.html" class="navigation navigation-prev " aria-label="Previous page: 机器视觉与Tesseract介绍"><i class="fa fa-angle-left"></i></a>
        
        
        <a href="../../file/part03/3.8.html" class="navigation navigation-next " aria-label="Next page: 案例：尝试对验证码进行机器识别处理"><i class="fa fa-angle-right"></i></a>
        
    </div>
</div>

        
<script src="../../gitbook/app.js"></script>

    
    <script src="../../gitbook/plugins/gitbook-plugin-splitter/splitter.js"></script>
    

    
    <script src="../../gitbook/plugins/gitbook-plugin-toggle-chapters/toggle.js"></script>
    

    
    <script src="../../gitbook/plugins/gitbook-plugin-fontsettings/buttons.js"></script>
    

    
    <script src="../../gitbook/plugins/gitbook-plugin-livereload/plugin.js"></script>
    

<script>
require(["gitbook"], function(gitbook) {
    var config = {"disqus":{"shortName":"gitbookuse"},"github":{"url":"https://github.com/dododream"},"search-pro":{"cutWordLib":"nodejieba","defineWord":["gitbook-use"]},"sharing":{"weibo":true,"facebook":true,"twitter":true,"google":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"tbfed-pagefooter":{"copyright":"Copyright © BigCat","modify_label":"「Revision Time:","modify_format":"YYYY-MM-DD HH:mm:ss」"},"baidu":{"token":"ff100361cdce95dd4c8fb96b4009f7bc"},"sitemap":{"hostname":"http://www.treenewbee.top"},"donate":{"wechat":"http://weixin.png","alipay":"http://alipay.png","title":"","button":"赏","alipayText":"支付宝打赏","wechatText":"微信打赏"},"edit-link":{"base":"https://github.com/dododream/edit","label":"Edit This Page"},"splitter":{},"toggle-chapters":{},"highlight":{},"fontsettings":{"theme":"white","family":"sans","size":2},"livereload":{}};
    gitbook.start(config);
});
</script>

        <!-- body:end -->
    </body>
    <!-- End of book Python爬虫课程讲义 -->
</html>
